Regression Model
LASSO
# Creation of CV folds
data_cv10 <- vfold_cv(Hotel_Reviews, v = 10)
# LASSO model specification (mixture = 1 => pure L1 penalty); penalty tuned via CV
lm_lasso_spec_tune <-
linear_reg() %>%
set_args(mixture = 1, penalty = tune()) %>%
set_engine(engine = 'glmnet') %>%
set_mode('regression')
# Recipe: preprocessing applied before model fitting
full_rec <- recipe(Reviewer_Average_Difference ~., data = Hotel_Reviews) %>%
step_rm(month)%>%
step_nzv(all_predictors()) %>% # removes near-zero-variance predictors
step_unknown(all_nominal_predictors()) %>% # pools unseen factor levels into "unknown"
step_normalize(all_numeric_predictors()) %>% # important standardization step for LASSO
# FIX: correlation filter restricted to predictors; all_numeric() also included
# the outcome, letting the filter consider/remove columns based on the outcome
step_corr(all_numeric_predictors())%>%
step_dummy(all_nominal_predictors()) # creates indicator variables for categorical variables
# Workflow = recipe + model
lasso_wf_tune <- workflow() %>%
add_recipe(full_rec) %>%
add_model(lm_lasso_spec_tune)
# Tune Model (trying a variety of values of Lambda penalty; range is log10: 1e-4..10)
penalty_grid <- grid_regular(
penalty(range = c(-4, 1)),
levels = 30)
tune_output <- tune_grid(
lasso_wf_tune,
resamples = data_cv10,
metrics = metric_set(rmse, mae),
grid = penalty_grid
)
autoplot(tune_output) + theme_classic()

# Summarize Model Evaluation Metrics (CV)
# One row per (penalty, metric): mean and std_err across the 10 folds
collect_metrics(tune_output)
## # A tibble: 60 × 7
## penalty .metric .estimator mean n std_err .config
## <dbl> <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 0.0001 mae standard 1.03 10 0.00129 Preprocessor1_Model01
## 2 0.0001 rmse standard 1.35 10 0.00172 Preprocessor1_Model01
## 3 0.000149 mae standard 1.03 10 0.00129 Preprocessor1_Model02
## 4 0.000149 rmse standard 1.35 10 0.00172 Preprocessor1_Model02
## 5 0.000221 mae standard 1.03 10 0.00129 Preprocessor1_Model03
## 6 0.000221 rmse standard 1.35 10 0.00172 Preprocessor1_Model03
## 7 0.000329 mae standard 1.03 10 0.00129 Preprocessor1_Model04
## 8 0.000329 rmse standard 1.35 10 0.00172 Preprocessor1_Model04
## 9 0.000489 mae standard 1.03 10 0.00129 Preprocessor1_Model05
## 10 0.000489 rmse standard 1.35 10 0.00171 Preprocessor1_Model05
## # … with 50 more rows
# Choose largest penalty value within 1 se
# (one-standard-error rule: desc(penalty) prefers the sparsest model whose
# CV MAE is within one SE of the best)
best_se_penalty <- select_by_one_std_err(tune_output, metric = 'mae', desc(penalty))
best_se_penalty
## # A tibble: 1 × 9
## penalty .metric .estimator mean n std_err .config .best .bound
## <dbl> <chr> <chr> <dbl> <int> <dbl> <chr> <dbl> <dbl>
## 1 0.00788 mae standard 1.03 10 0.00126 Preprocessor1_Mod… 1.03 1.03
# Fit Final Model based on our full 500k dataset
final_wf_se <- finalize_workflow(lasso_wf_tune, best_se_penalty) # incorporates penalty value to workflow
final_fit_se <- fit(final_wf_se, data = Hotel_Reviews)
# Look at each variables' importance
# (|standardized coefficient|; terms shrunk to exactly 0 by LASSO are dropped)
tidy(final_fit_se) %>%
filter(estimate != 0.0000000) %>%
mutate(importance = abs(estimate)) %>%
arrange(desc(importance))
## # A tibble: 42 × 4
## term estimate penalty importance
## <chr> <dbl> <dbl> <dbl>
## 1 Review_Total_Negative_Word_Counts -0.608 0.00788 0.608
## 2 Review_Total_Positive_Word_Counts 0.362 0.00788 0.362
## 3 Reviewer_Nationality_Israel 0.358 0.00788 0.358
## 4 Reviewer_Nationality_United.Kingdom 0.227 0.00788 0.227
## 5 Reviewer_Nationality_Iran -0.192 0.00788 0.192
## 6 Reviewer_Nationality_Ireland 0.188 0.00788 0.188
## 7 Reviewer_continent_Asia -0.177 0.00788 0.177
## 8 Reviewer_continent_Americas 0.164 0.00788 0.164
## 9 Reviewer_Nationality_United.Arab.Emirates -0.162 0.00788 0.162
## 10 Reviewer_continent_Oceania 0.161 0.00788 0.161
## # … with 32 more rows
# Evaluating
# Coefficients that survived the LASSO penalty (non-zero estimates)
final_fit_se %>% tidy() %>% filter(estimate != 0)
## # A tibble: 42 × 3
## term estimate penalty
## <chr> <dbl> <dbl>
## 1 (Intercept) -0.117 0.00788
## 2 Average_Score -0.0534 0.00788
## 3 Review_Total_Negative_Word_Counts -0.608 0.00788
## 4 Review_Total_Positive_Word_Counts 0.362 0.00788
## 5 Reviewer_Nationality_Belgium -0.0421 0.00788
## 6 Reviewer_Nationality_China 0.100 0.00788
## 7 Reviewer_Nationality_Cyprus 0.128 0.00788
## 8 Reviewer_Nationality_France -0.0176 0.00788
## 9 Reviewer_Nationality_Germany -0.0331 0.00788
## 10 Reviewer_Nationality_Hong.Kong -0.0306 0.00788
## # … with 32 more rows
tune_output %>% collect_metrics() %>% filter(penalty == (best_se_penalty %>% pull(penalty)))
## # A tibble: 2 × 7
## penalty .metric .estimator mean n std_err .config
## <dbl> <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 0.00788 mae standard 1.03 10 0.00126 Preprocessor1_Model12
## 2 0.00788 rmse standard 1.35 10 0.00169 Preprocessor1_Model12
# Residual diagnostics: residuals vs fitted values on the training data
lasso_preds <- predict(final_fit_se, new_data = Hotel_Reviews)
lasso_mod_output <- lasso_preds %>%
  bind_cols(Hotel_Reviews) %>%
  mutate(resid = Reviewer_Average_Difference - .pred)
# Points around the red zero line (and a flat smoother) indicate no systematic bias
ggplot(lasso_mod_output, aes(x = .pred, y = resid)) +
  geom_point() +
  geom_smooth() +
  geom_hline(yintercept = 0, color = "red") +
  theme_classic()

# Extract the underlying glmnet object from the parsnip wrapper
glmnet_output <- final_fit_se %>% extract_fit_parsnip() %>% pluck('fit') # get the original glmnet output
lambdas <- glmnet_output$lambda
# Visualize LASSO lambda result
# Build a long data frame of coefficient paths: one (lambda, term, coef) row
coefs_lambdas <-
coefficients(glmnet_output, s = lambdas ) %>%
as.matrix() %>%
t() %>%
as.data.frame() %>%
mutate(lambda = lambdas ) %>%
select(lambda, everything(), -`(Intercept)`) %>%
pivot_longer(cols = -lambda,
names_to = "term",
values_to = "coef") %>%
# var = text before the first "_" so dummy levels of one factor share a color
mutate(var = purrr::map_chr(stringr::str_split(term,"_"),~.[1]))
coefs_lambdas %>%
ggplot(aes(x = lambda, y = coef, group = term, color = var)) +
geom_line() +
# dashed line marks the penalty chosen by the one-SE rule
geom_vline(xintercept = best_se_penalty %>% pull(penalty), linetype = 'dashed') +
theme_classic() +
theme(legend.position = "bottom", legend.text=element_text(size=8))

# Create a boolean matrix (predictors x lambdas) of variable exclusion
bool_predictor_exclude <- glmnet_output$beta == 0
# For each predictor, count how many lambda values it survives before being
# excluded for good (higher = survives heavier penalties = more important).
# vapply replaces sapply so the return type is guaranteed numeric, not
# input-dependent.
var_imp <- vapply(seq_len(nrow(bool_predictor_exclude)), function(row) {
  # Coefficient exclusion path (sorted from highest to lowest lambda)
  this_coeff_path <- bool_predictor_exclude[row, ]
  # Number of lambdas until this variable is out forever
  ncol(bool_predictor_exclude) - which.min(this_coeff_path) + 1
}, numeric(1))
# Create a dataset of this information and sort
var_imp_data <- tibble(
  var_name = rownames(bool_predictor_exclude),
  var_imp = var_imp
)
var_imp_data %>% arrange(desc(var_imp))
## # A tibble: 263 × 2
## var_name var_imp
## <chr> <dbl>
## 1 Total_Number_of_Reviews 82
## 2 lat 82
## 3 Reviewer_Nationality_Antarctica 82
## 4 Reviewer_Nationality_Armenia 82
## 5 Reviewer_Nationality_Australia 82
## 6 Reviewer_Nationality_Cameroon 82
## 7 Reviewer_Nationality_Canada 82
## 8 Reviewer_Nationality_Cape.Verde 82
## 9 Reviewer_Nationality_Cayman.Islands 82
## 10 Reviewer_Nationality_Ecuador 82
## # … with 253 more rows
# Drop nationality dummies, lat/lng, and month dummies to focus on the rest.
# (!str_detect(...) replaces the str_detect(...) == FALSE comparison)
var_imp_data_delete_Nationality <- var_imp_data %>%
  filter(
    !str_detect(var_name, "Reviewer_Nationality_"),
    !str_detect(var_name, "lat"),
    !str_detect(var_name, "Month_")
  )
var_imp_data_delete_Nationality %>% arrange(desc(var_imp))
## # A tibble: 24 × 2
## var_name var_imp
## <chr> <dbl>
## 1 Total_Number_of_Reviews 82
## 2 Hotel_Country_unknown 82
## 3 Reviewer_continent_Europe 82
## 4 Reviewer_continent_unknown 82
## 5 season_unknown 82
## 6 Review_Total_Negative_Word_Counts 81
## 7 Review_Total_Positive_Word_Counts 76
## 8 Reviewer_continent_Asia 66
## 9 Average_Score 55
## 10 Reviewer_continent_Americas 55
## # … with 14 more rows
GAMs
# Remove month, year, nationality.
# FIX: the original used Hotel_Reviews[, -which(names(...) == "x")] per column;
# which() returns integer(0) when the column is absent, and df[, -integer(0)]
# silently drops EVERY column. Subsetting by name is safe either way.
Hotel_Reviews <- Hotel_Reviews[, !(names(Hotel_Reviews) %in%
  c("month", "Month", "day", "Reviewer_Nationality"))]
# Create new variables: share of positive / negative words per review, rounded
# to 2 decimals (round() replaces the formatC()/as.numeric() round-trip)
total_words <- Hotel_Reviews$Review_Total_Negative_Word_Counts +
  Hotel_Reviews$Review_Total_Positive_Word_Counts
Hotel_Reviews$Review_Total_Positive_Word_Percentage <-
  round(Hotel_Reviews$Review_Total_Positive_Word_Counts / total_words, 2)
# BUG FIX: the original overwrote the negative percentage with a copy of the
# POSITIVE percentage column, so both columns held identical values (which
# also explains the rank-deficient percentage GAM later in this file).
Hotel_Reviews$Review_Total_Negative_Word_Percentage <-
  round(Hotel_Reviews$Review_Total_Negative_Word_Counts / total_words, 2)
# Drop 0/0 reviews (both word counts zero => NaN) and any blank/NA rows.
# is.nan() replaces the string comparison x == "NaN".
Hotel_Reviews <- Hotel_Reviews %>% filter(!is.nan(Review_Total_Negative_Word_Percentage))
Hotel_Reviews <- Hotel_Reviews %>% filter(!is.nan(Review_Total_Positive_Word_Percentage))
Hotel_Reviews <- Hotel_Reviews %>% na_if("") %>% na.omit
# Visualize potential non-linearity before fitting GAMs:
# scatter + per-continent loess smoother for each word-count predictor
ggplot(Hotel_Reviews,
       aes(x = Review_Total_Negative_Word_Counts,
           y = Reviewer_Average_Difference,
           color = Reviewer_continent)) +
  geom_point(alpha = 0.2) +
  geom_smooth(span = 0.2, se = FALSE) +
  theme_classic()

ggplot(Hotel_Reviews,
       aes(x = Review_Total_Positive_Word_Counts,
           y = Reviewer_Average_Difference,
           color = Reviewer_continent)) +
  geom_point(alpha = 0.2) +
  geom_smooth(span = 0.2, se = FALSE) +
  theme_classic()

GAMs with Counts
# Generalized Additive Regression (GAM) Model
gam_spec <-
gen_additive_mod() %>%
set_engine(engine = 'mgcv') %>%
set_mode('regression')
# Smooth terms s() for the numeric predictors; continent and season enter linearly
fit_gam_model <- gam_spec %>%
fit(Reviewer_Average_Difference ~ s(Review_Total_Negative_Word_Counts) + s(Review_Total_Positive_Word_Counts) + s(Average_Score) + Reviewer_continent + season, data = Hotel_Reviews)
# Summary: Parameter (linear) estimates and then Smooth Terms (H0: no relationship)
fit_gam_model %>% pluck('fit') %>% summary()
##
## Family: gaussian
## Link function: identity
##
## Formula:
## Reviewer_Average_Difference ~ s(Review_Total_Negative_Word_Counts) +
## s(Review_Total_Positive_Word_Counts) + s(Average_Score) +
## Reviewer_continent + season
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.164874 0.013996 -11.780 < 2e-16 ***
## Reviewer_continentAmericas 0.260702 0.014823 17.587 < 2e-16 ***
## Reviewer_continentAsia -0.161955 0.014401 -11.246 < 2e-16 ***
## Reviewer_continentEurope 0.160857 0.013774 11.678 < 2e-16 ***
## Reviewer_continentOceania 0.154822 0.015863 9.760 < 2e-16 ***
## seasonSpring 0.041426 0.005147 8.048 8.41e-16 ***
## seasonSummer 0.002059 0.005074 0.406 0.685
## seasonWinter 0.123795 0.005253 23.565 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(Review_Total_Negative_Word_Counts) 8.926 8.997 16448.5 <2e-16 ***
## s(Review_Total_Positive_Word_Counts) 8.972 9.000 8469.4 <2e-16 ***
## s(Average_Score) 7.979 8.692 548.7 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.296 Deviance explained = 29.6%
## GCV = 1.6401 Scale est. = 1.64 n = 510506
# Diagnostics: Check to see if the number of knots is large enough (if p-value is low, increase number of knots)
# NOTE(review): par() is changed globally here and never restored afterwards
par(mfrow=c(2,2))
fit_gam_model %>% pluck('fit') %>% mgcv::gam.check()

##
## Method: GCV Optimizer: magic
## Smoothing parameter selection converged after 12 iterations.
## The RMS GCV score gradient at convergence was 2.111654e-07 .
## The Hessian was positive definite.
## Model rank = 35 / 35
##
## Basis dimension (k) checking results. Low p-value (k-index<1) may
## indicate that k is too low, especially if edf is close to k'.
##
## k' edf k-index p-value
## s(Review_Total_Negative_Word_Counts) 9.00 8.93 0.98 0.055 .
## s(Review_Total_Positive_Word_Counts) 9.00 8.97 0.97 0.055 .
## s(Average_Score) 9.00 7.98 0.99 0.175
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Plot functions for each predictor
# Visualize: Look at the estimated non-linear functions
# Dashed lines are +/- 2 SEs
fit_gam_model %>% pluck('fit') %>% plot( pages = 1)

# Refit with more basis functions (k = 20) for each smooth term
fit_gam_model_2 <- gam_spec %>%
fit(Reviewer_Average_Difference ~ s(Review_Total_Negative_Word_Counts, k = 20) + s(Review_Total_Positive_Word_Counts, k = 20)+ Reviewer_continent + season + s(Average_Score, k = 20), data = Hotel_Reviews)
# Diagnostics: Check to see if the number of knots is large enough (if p-value is low, increase number of knots)
par(mfrow=c(2,2))
fit_gam_model_2 %>% pluck('fit') %>% mgcv::gam.check()

##
## Method: GCV Optimizer: magic
## Smoothing parameter selection converged after 11 iterations.
## The RMS GCV score gradient at convergence was 3.533854e-07 .
## The Hessian was positive definite.
## Model rank = 65 / 65
##
## Basis dimension (k) checking results. Low p-value (k-index<1) may
## indicate that k is too low, especially if edf is close to k'.
##
## k' edf k-index p-value
## s(Review_Total_Negative_Word_Counts) 19.0 16.9 1.00 0.52
## s(Review_Total_Positive_Word_Counts) 19.0 18.8 0.99 0.21
## s(Average_Score) 19.0 15.6 0.99 0.33
# Middle ground: k = 15 basis functions per smooth term
fit_gam_model_3 <- gam_spec %>%
fit(Reviewer_Average_Difference ~ s(Review_Total_Negative_Word_Counts, k = 15) + s(Review_Total_Positive_Word_Counts, k = 15) + Reviewer_continent + season + s(Average_Score, k = 15), data = Hotel_Reviews)
# Diagnostics: Check to see if the number of knots is large enough (if p-value is low, increase number of knots)
par(mfrow=c(2,2))
fit_gam_model_3 %>% pluck('fit') %>% mgcv::gam.check()

##
## Method: GCV Optimizer: magic
## Smoothing parameter selection converged after 11 iterations.
## The RMS GCV score gradient at convergence was 3.898239e-07 .
## The Hessian was positive definite.
## Model rank = 50 / 50
##
## Basis dimension (k) checking results. Low p-value (k-index<1) may
## indicate that k is too low, especially if edf is close to k'.
##
## k' edf k-index p-value
## s(Review_Total_Negative_Word_Counts) 14.0 13.9 0.98 0.085 .
## s(Review_Total_Positive_Word_Counts) 14.0 14.0 0.97 0.055 .
## s(Average_Score) 14.0 13.5 0.98 0.065 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Our final GAM model, with k = 10 basis functions per smooth term
# (the previous comment said "20 breakpoints", but the fit below uses k = 10)
fit_gam_model_final <- gam_spec %>%
fit(Reviewer_Average_Difference ~ s(Review_Total_Negative_Word_Counts, k = 10) + s(Review_Total_Positive_Word_Counts, k = 10)+ Reviewer_continent + season + s(Average_Score, k = 10), data = Hotel_Reviews)
fit_gam_model_final %>% pluck('fit') %>% plot( pages = 1)

GAMs with Percentage
# Generalized Additive Regression (GAM) Model using word percentages instead of raw counts
gam_percentage_spec <-
gen_additive_mod() %>%
set_engine(engine = 'mgcv') %>%
set_mode('regression')
# NOTE(review): the summary below reports Rank: 34/35 — consistent with the two
# percentage columns being collinear (see the percentage-creation bug above);
# confirm after fixing that both smooths are still wanted.
fit_gam_percentage_model <- gam_percentage_spec %>%
fit(Reviewer_Average_Difference ~ s(Review_Total_Negative_Word_Percentage) + s(Review_Total_Positive_Word_Percentage) + s(Average_Score) + Reviewer_continent + season, data = Hotel_Reviews)
# Summary: Parameter (linear) estimates and then Smooth Terms (H0: no relationship)
fit_gam_percentage_model %>% pluck('fit') %>% summary()
##
## Family: gaussian
## Link function: identity
##
## Formula:
## Reviewer_Average_Difference ~ s(Review_Total_Negative_Word_Percentage) +
## s(Review_Total_Positive_Word_Percentage) + s(Average_Score) +
## Reviewer_continent + season
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.165343 0.013712 -12.058 < 2e-16 ***
## Reviewer_continentAmericas 0.266811 0.014519 18.376 < 2e-16 ***
## Reviewer_continentAsia -0.160449 0.014117 -11.366 < 2e-16 ***
## Reviewer_continentEurope 0.161555 0.013497 11.969 < 2e-16 ***
## Reviewer_continentOceania 0.155038 0.015542 9.975 < 2e-16 ***
## seasonSpring 0.040847 0.005044 8.099 5.57e-16 ***
## seasonSummer 0.002200 0.004973 0.442 0.658
## seasonWinter 0.120793 0.005149 23.458 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(Review_Total_Negative_Word_Percentage) 7.545 7.546 0.978 0.552
## s(Review_Total_Positive_Word_Percentage) 1.453 1.454 0.085 0.906
## s(Average_Score) 8.083 8.747 702.516 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Rank: 34/35
## R-sq.(adj) = 0.323 Deviance explained = 32.3%
## GCV = 1.576 Scale est. = 1.576 n = 510506
# Diagnostics: Check to see if the number of knots is large enough (if p-value is low, increase number of knots)
# NOTE(review): par() is changed globally here and never restored afterwards
par(mfrow=c(2,2))
fit_gam_percentage_model %>% pluck('fit') %>% mgcv::gam.check()

##
## Method: GCV Optimizer: magic
## Smoothing parameter selection converged after 15 iterations.
## The RMS GCV score gradient at convergence was 2.373486e-07 .
## The Hessian was not positive definite.
## Model rank = 34 / 35
##
## Basis dimension (k) checking results. Low p-value (k-index<1) may
## indicate that k is too low, especially if edf is close to k'.
##
## k' edf k-index p-value
## s(Review_Total_Negative_Word_Percentage) 9.00 7.54 0.97 0.025 *
## s(Review_Total_Positive_Word_Percentage) 9.00 1.45 0.97 0.035 *
## s(Average_Score) 9.00 8.08 1.01 0.715
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Plot functions for each predictor
# Visualize: Look at the estimated non-linear functions
# Dashed lines are +/- 2 SEs
fit_gam_percentage_model %>% pluck('fit') %>% plot( pages = 1)

Classification Model
Clean data for classification
# Create binary variable for classification:
# TRUE when the reviewer scored this hotel above their own average
Hotel_Reviews$Reviewer_Average_Difference_Categorical <- ifelse(Hotel_Reviews$Reviewer_Average_Difference > 0, TRUE, FALSE)
Hotel_Reviews$Reviewer_Average_Difference_Categorical <- as.factor(Hotel_Reviews$Reviewer_Average_Difference_Categorical)
Hotel_Reviews <- Hotel_Reviews %>% filter(!is.na(Reviewer_continent))
# Remove all unnecessary variables + variables that create (i.e. leak)
# Reviewer_Average_Difference_Categorical.
# FIX: the original used Hotel_Reviews[, -which(names(...) == "x")] per column;
# which() returns integer(0) for a missing column and df[, -integer(0)]
# silently drops EVERY column. Name-based subsetting is safe either way.
# ("Reviewer_Nationality" was already commented out in the original.)
cols_to_drop <- c("lat", "lng", "Reviewer_Average_Difference",
                  "Review_Total_Positive_Word_Percentage",
                  "Review_Total_Negative_Word_Percentage",
                  "Reviewer_Score", "Average_Score")
Hotel_Reviews <- Hotel_Reviews[, !(names(Hotel_Reviews) %in% cols_to_drop)]
One Single Tree
# 10-fold CV for tuning the decision tree's cost-complexity parameter
data_fold <- vfold_cv(Hotel_Reviews, v = 10)
# min_n = 2 and tree_depth = NULL leave pruning entirely to cost_complexity
ct_spec_tune <- decision_tree() %>%
set_engine(engine = 'rpart') %>%
set_args(cost_complexity = tune(),
min_n = 2,
tree_depth = NULL) %>%
set_mode('classification')
data_rec <- recipe(Reviewer_Average_Difference_Categorical~ ., data = Hotel_Reviews)
data_wf_tune <- workflow() %>%
add_model(ct_spec_tune) %>%
add_recipe(data_rec)
# log10 grid for cost_complexity: 1e-5 to 10
param_grid <- grid_regular(cost_complexity(range = c(-5, 1)), levels = 10)
tune_res <- tune_grid(
data_wf_tune,
resamples = data_fold,
grid = param_grid,
metrics = metric_set(accuracy) #change this for regression trees
)
autoplot(tune_res) + theme_classic()

# One-SE rule: largest cost_complexity (most pruned tree) within 1 SE of best accuracy
best_complexity <- select_by_one_std_err(tune_res, metric = 'accuracy', desc(cost_complexity))
data_wf_final <- finalize_workflow(data_wf_tune, best_complexity)
hotel_final_fit <- fit(data_wf_final, data = Hotel_Reviews)
# CV accuracy at the selected cost_complexity
# NOTE(review): exact == on a double works only because both values share the same grid
tune_res %>%
collect_metrics() %>%
filter(cost_complexity == best_complexity %>% pull(cost_complexity))
## # A tibble: 1 × 7
## cost_complexity .metric .estimator mean n std_err .config
## <dbl> <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 0.000215 accuracy binary 0.729 10 0.000680 Preprocessor1_Model03
library("rpart.plot")
# Draw the pruned tree chosen by the one-SE rule
hotel_final_fit %>% extract_fit_engine() %>% rpart.plot()

# Refit with a large cost_complexity (heavy pruning) for contrast
tree_mod_highcp <- fit(
data_wf_tune %>%
update_model(ct_spec_tune %>% set_args(cost_complexity = .1)),
data = Hotel_Reviews
)
tree_mod_highcp %>% extract_fit_engine() %>% rpart.plot()

# The best single tree ever!
# (smaller cost_complexity = less pruning = deeper tree)
tree_mod_lowcp <- fit(
data_wf_tune %>%
update_model(ct_spec_tune %>% set_args(cost_complexity = .01)),
data = Hotel_Reviews
)
tree_mod_lowcp %>% extract_fit_engine() %>% rpart.plot()

Random Forest
# Recipe
data_rec <- recipe(Reviewer_Average_Difference_Categorical ~ ., data = Hotel_Reviews) %>%
step_nzv(all_predictors()) %>%
step_novel(all_nominal_predictors()) %>%
step_dummy(all_nominal_predictors())
# Model Specification: 1000-tree random forest, default mtry (NULL),
# impurity-based variable importance
rf_spec <- rand_forest() %>%
set_engine(engine = 'ranger') %>%
set_args(mtry = NULL,
trees = 1000,
min_n = 2,
probability = FALSE, # FALSE: get hard predictions (not needed for regression)
importance = 'impurity') %>%
set_mode('classification')
# Workflows
data_wf_mtry <- workflow() %>%
add_model(rf_spec) %>%
add_recipe(data_rec)
# Fit Models
data_fit_mtry <- fit(data_wf_mtry, data = Hotel_Reviews)
# Evaluation
# Custom Function to get OOB predictions, true observed outcomes and add a user-provided model label
rf_OOB_output <- function(fit_model, model_label, truth){
tibble(
.pred_class = fit_model %>% extract_fit_engine() %>% pluck('predictions'), #OOB predictions
Reviewer_Average_Difference_Categorical = truth,
label = model_label
)
}
#check out the function output
rf_OOB_output(data_fit_mtry, TRUE, Hotel_Reviews %>% pull(Reviewer_Average_Difference_Categorical))
## # A tibble: 510,506 × 3
## .pred_class Reviewer_Average_Difference_Categorical label
## <fct> <fct> <lgl>
## 1 FALSE FALSE TRUE
## 2 TRUE FALSE TRUE
## 3 TRUE FALSE TRUE
## 4 FALSE FALSE TRUE
## 5 FALSE FALSE TRUE
## 6 TRUE FALSE TRUE
## 7 FALSE FALSE TRUE
## 8 TRUE TRUE TRUE
## 9 FALSE FALSE TRUE
## 10 TRUE TRUE TRUE
## # … with 510,496 more rows
# Evaluate OOB Metrics
# (bind_rows kept so additional models' OOB outputs can be stacked later)
oob_truth <- Hotel_Reviews %>% pull(Reviewer_Average_Difference_Categorical)
data_rf_OOB_output <- bind_rows(
  rf_OOB_output(data_fit_mtry, TRUE, oob_truth)
)
# OOB accuracy: honest estimate without a separate test set
accuracy(data_rf_OOB_output,
         truth = Reviewer_Average_Difference_Categorical,
         estimate = .pred_class)
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 accuracy binary 0.730
# OOB prediction error
# Confusion matrix of OOB predictions against the observed classes
confusion <- rf_OOB_output(data_fit_mtry,TRUE, Hotel_Reviews %>% pull(Reviewer_Average_Difference_Categorical)) %>%
conf_mat(truth = Reviewer_Average_Difference_Categorical, estimate = .pred_class)
confusion
## Truth
## Prediction FALSE TRUE
## FALSE 127926 54541
## TRUE 83074 244965
# Variable importance measurement
model_output <-data_fit_mtry %>%
extract_fit_engine()
# Top-30 variables ranked by impurity-based importance
model_output %>%
vip(num_features = 30) + theme_classic() #based on impurity

model_output %>% vip::vi() %>% head()
## # A tibble: 6 × 2
## Variable Importance
## <chr> <dbl>
## 1 Review_Total_Negative_Word_Counts 42371.
## 2 Review_Total_Positive_Word_Counts 23336.
## 3 Total_Number_of_Reviews 10535.
## 4 Additional_Number_of_Scoring 10301.
## 5 Total_Number_of_Reviews_Reviewer_Has_Given 7163.
## 6 year 2011.
model_output %>% vip::vi() %>% tail()
## # A tibble: 6 × 2
## Variable Importance
## <chr> <dbl>
## 1 Hotel_Country_Netherlands 563.
## 2 Hotel_Country_Italy 562.
## 3 Reviewer_continent_Oceania 404.
## 4 Hotel_Country_new 0
## 5 Reviewer_continent_new 0
## 6 season_new 0
# Distribution of the two strongest predictors by outcome class
Hotel_Reviews %>%
  ggplot(aes(x = Reviewer_Average_Difference_Categorical,
             y = Review_Total_Negative_Word_Counts)) +
  geom_violin() +
  theme_classic()

Hotel_Reviews %>%
  ggplot(aes(x = Reviewer_Average_Difference_Categorical,
             y = Review_Total_Positive_Word_Counts)) +
  geom_violin() +
  theme_classic()

LASSO for Logistic
# Set reference level (to the outcome you are NOT interested in)
Hotel_Reviews <- Hotel_Reviews%>%
mutate(Reviewer_Average_Difference_Categorical = relevel(factor(Reviewer_Average_Difference_Categorical), ref='FALSE'))
data_cv10 <- vfold_cv(Hotel_Reviews, v = 10)
# Logistic LASSO Regression Model Spec (mixture = 1 => pure L1 penalty)
logistic_lasso_spec_tune <- logistic_reg() %>%
set_engine('glmnet') %>%
set_args(mixture = 1, penalty = tune()) %>%
set_mode('classification')
# Recipe
# (normalize first so the LASSO penalty treats all predictors on the same scale)
logistic_rec <- recipe(Reviewer_Average_Difference_Categorical ~ ., data = Hotel_Reviews) %>%
step_normalize(all_numeric_predictors()) %>%
step_dummy(all_nominal_predictors())
# Workflow (Recipe + Model)
log_lasso_wf <- workflow() %>%
add_recipe(logistic_rec) %>%
add_model(logistic_lasso_spec_tune)
# Tune Model (trying a variety of values of Lambda penalty)
penalty_grid <- grid_regular(
penalty(range = c(-5, 1)), #log10 transformed (kept moving min down from 0)
levels = 100)
# event_level = 'second' because TRUE is the second factor level after releveling
tune_output <- tune_grid(
log_lasso_wf, # workflow
resamples = data_cv10, # cv folds
metrics = metric_set(roc_auc,accuracy),
control = control_resamples(save_pred = TRUE, event_level = 'second'),
grid = penalty_grid # penalty grid defined above
)
# Visualize Model Evaluation Metrics from Tuning
autoplot(tune_output) + theme_classic()

# Select Penalty
# One-SE rule on CV roc_auc: prefer the sparsest model within one SE of the best
best_se_penalty <- select_by_one_std_err(tune_output, metric = 'roc_auc', desc(penalty)) # choose penalty value based on the largest penalty within 1 se of the highest CV roc_auc
best_se_penalty
## # A tibble: 1 × 9
## penalty .metric .estimator mean n std_err .config .best .bound
## <dbl> <chr> <chr> <dbl> <int> <dbl> <chr> <dbl> <dbl>
## 1 0.0248 roc_auc binary 0.776 10 0.000621 Preprocessor1_Mo… 0.777 0.776
# Fit Final Model
final_fit_se <- finalize_workflow(log_lasso_wf, best_se_penalty) %>% # incorporates penalty value to workflow
fit(data = Hotel_Reviews)
# Terms eliminated by the LASSO penalty (coefficients shrunk to exactly 0)
final_fit_se %>% tidy() %>%
filter(estimate == 0)
## # A tibble: 15 × 3
## term estimate penalty
## <chr> <dbl> <dbl>
## 1 Additional_Number_of_Scoring 0 0.0248
## 2 year 0 0.0248
## 3 Total_Number_of_Reviews 0 0.0248
## 4 Total_Number_of_Reviews_Reviewer_Has_Given 0 0.0248
## 5 Hotel_Country_France 0 0.0248
## 6 Hotel_Country_Italy 0 0.0248
## 7 Hotel_Country_Netherlands 0 0.0248
## 8 Hotel_Country_Spain 0 0.0248
## 9 Hotel_Country_United.Kingdom 0 0.0248
## 10 Reviewer_continent_Americas 0 0.0248
## 11 Reviewer_continent_Europe 0 0.0248
## 12 Reviewer_continent_Oceania 0 0.0248
## 13 season_Spring 0 0.0248
## 14 season_Summer 0 0.0248
## 15 season_Winter 0 0.0248
# Terms retained by the LASSO (non-zero coefficients)
final_fit_se %>% tidy() %>%
filter(estimate != 0)
## # A tibble: 4 × 3
## term estimate penalty
## <chr> <dbl> <dbl>
## 1 (Intercept) 0.371 0.0248
## 2 Review_Total_Negative_Word_Counts -0.847 0.0248
## 3 Review_Total_Positive_Word_Counts 0.532 0.0248
## 4 Reviewer_continent_Asia -0.125 0.0248
# Extract the underlying glmnet object from the fitted workflow
glmnet_output <- final_fit_se %>% extract_fit_engine()
# Create a boolean matrix (predictors x lambdas) of variable exclusion
bool_predictor_exclude <- glmnet_output$beta == 0
# For each predictor, count how many lambda values it survives before being
# excluded for good (higher = survives heavier penalties = more important).
# vapply replaces sapply so the return type is guaranteed numeric, not
# input-dependent.
var_imp <- vapply(seq_len(nrow(bool_predictor_exclude)), function(row) {
  # Coefficient exclusion path (sorted from highest to lowest lambda)
  this_coeff_path <- bool_predictor_exclude[row, ]
  # Number of lambdas until this variable is out forever
  ncol(bool_predictor_exclude) - which.min(this_coeff_path) + 1
}, numeric(1))
# Create a dataset of this information and sort
var_imp_data <- tibble(
  var_name = rownames(bool_predictor_exclude),
  var_imp = var_imp
)
var_imp_data %>% arrange(desc(var_imp))
## # A tibble: 18 × 2
## var_name var_imp
## <chr> <dbl>
## 1 year 63
## 2 Hotel_Country_Spain 63
## 3 season_Summer 63
## 4 Review_Total_Negative_Word_Counts 62
## 5 Review_Total_Positive_Word_Counts 58
## 6 Reviewer_continent_Asia 47
## 7 Reviewer_continent_Americas 39
## 8 Total_Number_of_Reviews_Reviewer_Has_Given 36
## 9 season_Winter 36
## 10 Hotel_Country_Netherlands 32
## 11 Hotel_Country_Italy 31
## 12 Additional_Number_of_Scoring 22
## 13 Hotel_Country_United.Kingdom 20
## 14 season_Spring 19
## 15 Reviewer_continent_Europe 18
## 16 Reviewer_continent_Oceania 14
## 17 Hotel_Country_France 10
## 18 Total_Number_of_Reviews 9
# CV results for "best lambda"
# NOTE(review): exact == on a double works only because both values share the same grid
tune_output %>%
collect_metrics() %>%
filter(penalty == best_se_penalty %>% pull(penalty))
## # A tibble: 2 × 7
## penalty .metric .estimator mean n std_err .config
## <dbl> <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 0.0248 accuracy binary 0.692 10 0.000738 Preprocessor1_Model057
## 2 0.0248 roc_auc binary 0.776 10 0.000621 Preprocessor1_Model057
# Count up number of T/F in the training data
Hotel_Reviews %>%
count(Reviewer_Average_Difference_Categorical) # Name of the outcome variable goes inside count()
## # A tibble: 2 × 2
## Reviewer_Average_Difference_Categorical n
## <fct> <int>
## 1 FALSE 211000
## 2 TRUE 299506
# Compute the NIR (no-information rate): accuracy of always predicting the
# majority class. FIX: computed from the data instead of hard-coded literals —
# the original numbers (299575 / 211057) did not match the counts printed
# above (299506 / 211000).
max(table(Hotel_Reviews$Reviewer_Average_Difference_Categorical)) / nrow(Hotel_Reviews)
## [1] 0.5866749
# Soft Predictions on Training Data
final_output <- final_fit_se %>%
predict(new_data = Hotel_Reviews, type='prob') %>%
bind_cols(Hotel_Reviews)
# Distribution of predicted P(TRUE) within each observed class
final_output %>%
ggplot(aes(x = Reviewer_Average_Difference_Categorical, y = .pred_TRUE)) +
geom_boxplot()

# Use soft predictions
# ROC curve; event_level = 'second' because TRUE is the second factor level
final_output %>%
roc_curve(Reviewer_Average_Difference_Categorical,.pred_TRUE,event_level = 'second') %>%
autoplot()

# Thresholds in terms of reference level
# Sweep thresholds on P(FALSE) from 0 to 1 and compute threshold metrics
threshold_output <- final_output %>%
threshold_perf(truth = Reviewer_Average_Difference_Categorical, estimate = .pred_FALSE, thresholds = seq(0,1,by=.01))
# J-index v. threshold for reviewer_Average_Difference_Categorical
# (J = sensitivity + specificity - 1; peaks at the best-balanced threshold)
threshold_output %>%
filter(.metric == 'j_index') %>%
ggplot(aes(x = .threshold, y = .estimate)) +
geom_line() +
labs(y = 'J-index', x = 'threshold') +
theme_classic()

# Thresholds ranked by distance metric (smaller = better balance)
threshold_output %>%
filter(.metric == 'distance') %>%
arrange(.estimate)
## # A tibble: 101 × 4
## .threshold .metric .estimator .estimate
## <dbl> <chr> <chr> <dbl>
## 1 0.4 distance binary 0.160
## 2 0.39 distance binary 0.162
## 3 0.41 distance binary 0.169
## 4 0.38 distance binary 0.176
## 5 0.42 distance binary 0.186
## 6 0.37 distance binary 0.200
## 7 0.43 distance binary 0.207
## 8 0.44 distance binary 0.233
## 9 0.36 distance binary 0.235
## 10 0.45 distance binary 0.261
## # … with 91 more rows
log_metrics <- metric_set(accuracy,sens,yardstick::spec)
#Accuracy + Specificity + Sensitivity
# Hard predictions from P(FALSE) at the chosen threshold of 0.40
final_output %>%
mutate(.pred_class = make_two_class_pred(.pred_FALSE, levels(Reviewer_Average_Difference_Categorical
), threshold = 0.40)) %>%
log_metrics(truth = Reviewer_Average_Difference_Categorical
, estimate = .pred_class, event_level = 'second')
## # A tibble: 3 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 accuracy binary 0.718
## 2 sens binary 0.720
## 3 spec binary 0.715